#' Global palette and theme
# Colour-blind-friendly qualitative palette (RColorBrewer "Dark2", 8 colours),
# reused throughout the document (e.g. the histogram fill below).
base_palette <- brewer.pal(n = 8, name = "Dark2")
# Override ggplot2's default discrete scales so every plot in this document
# picks up base_palette without specifying it on each plot.
scale_fill_discrete <- function(...) scale_fill_manual(values = base_palette, ...)
scale_color_discrete <- function(...) scale_color_manual(values = base_palette, ...)
# Minimal theme with a larger base font for all subsequent ggplots.
theme_set(theme_minimal(base_size = 14))
# Load the hospital data set (113 rows x 12 columns per the summaries below).
hospital_data <- read.csv("./project_hospitals.csv", header = TRUE)
# Five-number summaries for every column.
summary(hospital_data)
## ID Lgth.of.Sty Age Inf.Risk R.Cul.Rat R.CX.ray.Rat N.Beds
## Min. : 1 Min. : 6.700 Min. :38.80 Min. :1.300 Min. : 1.60 Min. : 39.60 Min. : 29.0
## 1st Qu.: 29 1st Qu.: 8.340 1st Qu.:50.90 1st Qu.:3.700 1st Qu.: 8.40 1st Qu.: 69.50 1st Qu.:106.0
## Median : 57 Median : 9.420 Median :53.20 Median :4.400 Median :14.10 Median : 82.30 Median :186.0
## Mean : 57 Mean : 9.648 Mean :53.23 Mean :4.355 Mean :15.79 Mean : 81.63 Mean :252.2
## 3rd Qu.: 85 3rd Qu.:10.470 3rd Qu.:56.20 3rd Qu.:5.200 3rd Qu.:20.30 3rd Qu.: 94.10 3rd Qu.:312.0
## Max. :113 Max. :19.560 Max. :65.90 Max. :7.800 Max. :60.50 Max. :133.50 Max. :835.0
## Med.Sc.Aff Region Avg.Pat Avg.Nur Pct.Ser.Fac
## Min. :1.00 Min. :1.000 Min. : 20.0 Min. : 14.0 Min. : 5.70
## 1st Qu.:2.00 1st Qu.:2.000 1st Qu.: 68.0 1st Qu.: 66.0 1st Qu.:31.40
## Median :2.00 Median :2.000 Median :143.0 Median :132.0 Median :42.90
## Mean :1.85 Mean :2.363 Mean :191.4 Mean :173.2 Mean :43.16
## 3rd Qu.:2.00 3rd Qu.:3.000 3rd Qu.:252.0 3rd Qu.:218.0 3rd Qu.:54.30
## Max. :2.00 Max. :4.000 Max. :791.0 Max. :656.0 Max. :80.00
# Count missing values per column (all zero, so no imputation is needed).
colSums(is.na(hospital_data))
## ID Lgth.of.Sty Age Inf.Risk R.Cul.Rat R.CX.ray.Rat N.Beds Med.Sc.Aff Region
## 0 0 0 0 0 0 0 0 0
## Avg.Pat Avg.Nur Pct.Ser.Fac
## 0 0 0
# Visual confirmation of completeness (visdat/naniar missingness map).
vis_miss(hospital_data)
Our data set hospital_data, read from the csv file
project_hospitals.csv, contains 113 observations of 12
variables. We have no missing values for any variable, thus we do not
need to impute for any observations. Out of the 12 variables, one of
them is an identifier variable ID, and will not be used for
predicting. The patient’s length of stay Lgth.of.Sty is the
variable we would like to predict. Therefore, we have 10 potential
explanatory variables and 1 response variable we are trying to predict.
# Distribution of the response variable. A single constant fill level
# (factor(1)) is mapped so the histogram uses base_palette[3]; the
# redundant legend is suppressed.
# Fix: the title was previously the empty string "", leaving a blank title.
ggplot(hospital_data, aes(x = Lgth.of.Sty, fill = factor(1))) +
  geom_histogram(binwidth = 2, color = "black") +
  labs(
    x = "Length of Stay (Days)",
    y = "Count",
    title = "Distribution of Length of Stay"
  ) +
  scale_fill_manual(values = base_palette[3]) +
  guides(fill = "none")
We see some slight right skew for Lgth.of.Sty.
# Drop the identifier column; it carries no predictive information.
base_data <- hospital_data |> select(-ID)
#base_data$RegionRaw <- base_data$Region
# Recode the integer-coded categorical variables as labelled factors.
# Base R factor(levels, labels) replaces the plyr::revalue() two-step
# (as.factor then rename) and produces identical factor levels in the
# same order.
base_data$Region <- factor(base_data$Region, levels = c(1, 2, 3, 4), labels = c("NE", "NC", "S", "W"))
base_data$Med.Sc.Aff <- factor(base_data$Med.Sc.Aff, levels = c(1, 2), labels = c("Yes", "No"))
# labels for graphs
# Human-readable labels for the 11 columns of base_data, in column order:
# position 1 = Lgth.of.Sty, ..., position 11 = Pct.Ser.Fac. Index this
# vector by column position (e.g. base_data_labels[indexes[i]]) so labels
# stay aligned with the variables they describe.
base_data_labels <- c(
"Length of Stay",
"Age",
"Infection Risk",
"Routine culturing ratio",
"Routine chest X-ray ratio",
"Number of beds",
"Medical School Affiliation",
"Region",
"Average Daily census",
"Number of nurses",
"Available facilities"
)
# Region and Med.Sc.Aff should now summarise as labelled factor counts
# rather than numeric quantiles.
summary(base_data) # sanity check
## Lgth.of.Sty Age Inf.Risk R.Cul.Rat R.CX.ray.Rat N.Beds Med.Sc.Aff Region
## Min. : 6.700 Min. :38.80 Min. :1.300 Min. : 1.60 Min. : 39.60 Min. : 29.0 Yes:17 NE:28
## 1st Qu.: 8.340 1st Qu.:50.90 1st Qu.:3.700 1st Qu.: 8.40 1st Qu.: 69.50 1st Qu.:106.0 No :96 NC:32
## Median : 9.420 Median :53.20 Median :4.400 Median :14.10 Median : 82.30 Median :186.0 S :37
## Mean : 9.648 Mean :53.23 Mean :4.355 Mean :15.79 Mean : 81.63 Mean :252.2 W :16
## 3rd Qu.:10.470 3rd Qu.:56.20 3rd Qu.:5.200 3rd Qu.:20.30 3rd Qu.: 94.10 3rd Qu.:312.0
## Max. :19.560 Max. :65.90 Max. :7.800 Max. :60.50 Max. :133.50 Max. :835.0
## Avg.Pat Avg.Nur Pct.Ser.Fac
## Min. : 20.0 Min. : 14.0 Min. : 5.70
## 1st Qu.: 68.0 1st Qu.: 66.0 1st Qu.:31.40
## Median :143.0 Median :132.0 Median :42.90
## Mean :191.4 Mean :173.2 Mean :43.16
## 3rd Qu.:252.0 3rd Qu.:218.0 3rd Qu.:54.30
## Max. :791.0 Max. :656.0 Max. :80.00
# Pearson correlations among the numeric columns only (the two factors
# are excluded because correlation is undefined for them).
numeric_only <- base_data |> select(-Region, -Med.Sc.Aff)
cor_matrix <- cor(numeric_only)
ggcorrplot(cor_matrix, lab = TRUE)
Length of Stay correlates most strongly with Infection Risk (positive) and Average Patients (positive), and negatively with Region; however, Region is a categorical variable encoded as an integer rather than a true numerical variable, so its correlation should be discounted.
# Base-graphics scatter-plot matrix of the numeric variables, with an OLS
# trend line drawn in every panel.
pairs(base_data |> select(-Region, -Med.Sc.Aff), panel = function(x, y) {
points(x, y, pch = 16, col = "blue") # Scatter points
abline(lm(y ~ x), col = "red") # Add trend line
})
# GGally pairs plot over all columns (factors included), with lm smoothers
# in the lower-triangle continuous panels.
ggpairs(base_data, lower = list(continuous = wrap("smooth", method = "lm", color = "blue")))
The variables Number of Beds, Average Patients, Average Nurses, and Percentage of service facilities exhibit high intercorrelation, particularly between Number of Beds and Average Patients (0.981), which could introduce collinearity in a predictive model.
An important strong correlation that appears to be an important factor in hospital stays is Infection Risk (0.533).
Both Culture Rate and Chest X-Ray Rate display moderate correlation with Infection Risk; this suggests an intuitive relationship: as infection risk increases, more diagnostic tests are performed. Infection Risk may also serve as a useful predictor for Length of Stay, as higher infection risks could lead to prolonged hospital stays.
# Compare linear-linear, linear-log, and log-linear specifications by
# correlating raw vs log-transformed versions of the numeric variables.
cor_data <- base_data %>% select(-Med.Sc.Aff, -Region)

# Log-transform every predictor, but keep the response on its raw scale.
lin_log_cor_data <- log(cor_data)
lin_log_cor_data$Lgth.of.Sty <- cor_data$Lgth.of.Sty

# Keep predictors raw, but log-transform the response.
log_lincor_data <- cor_data %>% select(-Lgth.of.Sty)
log_lincor_data$logLgth.of.Sty <- log(cor_data$Lgth.of.Sty)

# Compute all three Pearson correlation matrices up front, then render one
# heat map per specification.
pearson_lin_lin <- cor(cor_data, method = "pearson")
pearson_lin_log <- cor(lin_log_cor_data, method = "pearson")
pearson_log_lin <- cor(log_lincor_data, method = "pearson")
ggcorrplot(pearson_lin_lin, lab = TRUE, title = "Linear-Linear")
ggcorrplot(pearson_lin_log, lab = TRUE, title = "Linear-Log")
ggcorrplot(pearson_log_lin, lab = TRUE, title = "Log-Linear")
# Faceted scatter plots: each numeric predictor (columns 2:6 and 9:11 of
# base_data) against length of stay, with a loess smoother per panel.
long_data <- base_data %>%
pivot_longer(cols = c(2:6, 9:11), names_to = "Variable", values_to = "Value")
ggplot(long_data, aes(x = Value, y = Lgth.of.Sty)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "loess") +
facet_wrap(~ Variable, scales = "free_x") + # Auto-arrange subplots
theme_minimal()
# NOTE(review): this chunk is an exact byte-for-byte duplicate of the
# previous faceted-scatter chunk — it recomputes long_data and re-renders
# the identical plot. Presumably a copy-paste leftover; confirm and remove.
long_data <- base_data %>%
pivot_longer(cols = c(2:6, 9:11), names_to = "Variable", values_to = "Value")
ggplot(long_data, aes(x = Value, y = Lgth.of.Sty)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "loess") +
facet_wrap(~ Variable, scales = "free_x") + # Auto-arrange subplots
theme_minimal()
# Define color palette.
# Fix: "Set2" provides at most 8 colours; requesting n = 16 triggers a
# brewer.pal warning and returns only 8 anyway. Request exactly the 8
# needed (one per faceted variable), which yields the same colours
# without the warning.
plot_colors <- RColorBrewer::brewer.pal(n = 8, name = "Set2")
# Convert data to long format for faceting.
long_data <- base_data %>%
  pivot_longer(cols = c(2:6, 9:11), names_to = "Variable", values_to = "Value") %>%
  filter(Value > 0, Lgth.of.Sty > 0) # Remove zero or negative values (log undefined there)
# Faceted log-log scatter plots with one loess curve per variable.
ggplot(long_data, aes(x = log(Value), y = log(Lgth.of.Sty))) +
  geom_point(alpha = 0.6, aes(color = Variable)) + # Color points by variable
  geom_smooth(method = "loess", se = FALSE, aes(color = Variable)) + # Fit smoothing curve
  scale_color_manual(values = plot_colors) + # Apply manual color mapping
  facet_wrap(~ Variable, scales = "free_x") + # Create faceted plots
  ggtitle("Log-Transformed Variables vs Log Length of Stay") +
  xlab("Log Predictor Variables") +
  ylab("Log Length of Stay (Days)") +
  theme_minimal() +
  theme(legend.position = "none")
# Boxplots of length of stay by Region (coloured by medical-school
# affiliation) and by affiliation (coloured by Region), stacked vertically.
region_patient <- ggplot(data = base_data, aes(x = Region, y = Lgth.of.Sty, colour = Med.Sc.Aff)) +
  geom_boxplot() +
  ggtitle("Region vs Patient Length of Stay", "Grouped by Medical School Affiliation")
# Fix: title previously read "vs Pat, Length of Stay" (comma typo).
med_assoc <- ggplot(data = base_data, aes(x = Med.Sc.Aff, y = Lgth.of.Sty, colour = Region)) +
  geom_boxplot() +
  ggtitle("Med. School Aff. vs Pat. Length of Stay", "Grouped by Region")
grid.arrange(grobs = list(med_assoc, region_patient), ncol = 1)
By looking at the distributions (via the boxplots), we can see that there are some differences between a patient’s length of stay if we group by Region (or by Medical School Affiliation).
If grouping by Medical school affiliation: Length of stay between the regions is fairly similar on average if there is an affiliation for medical school, but the distributions are much more different between the regions. If there is no affiliation, then the distributions are a bit more similar, but the averages are different between each region.
If grouping by Region: Length of stay on average tends to be similar between the regions (being slightly less in the W and S regions) with NE having a wider distribution of length of stay. Interesting to note that medical school affiliation tends to have lower length of stay if the affiliation is none.
# One scatter plot per numeric predictor against length of stay, coloured
# by Region, arranged in a 2-column grid.
plot_list <- list()
indexes <- c(2:6, 9:11) # column positions of the numeric predictors in base_data
for (i in seq_along(indexes)) {
  col_idx <- indexes[i]
  x_var <- names(base_data)[col_idx]
  # Fix: labels must be looked up by *column position* (base_data_labels is
  # in column order). The previous base_data_labels[i] mislabelled every
  # panel (e.g. the Age panel was titled "Length of Stay").
  x_label <- base_data_labels[col_idx]
  # .data[[x_var]] replaces the deprecated aes_string() for a string-named column.
  temp_plot <- ggplot(data = base_data, aes(x = .data[[x_var]], y = Lgth.of.Sty, color = Region)) +
    geom_point() +
    geom_smooth() +
    ggtitle(paste0(x_label, " vs Length of Stay"), "Grouped by Region") +
    xlab(x_label) +
    ylab("Length of Stay (Days)") +
    theme(legend.position = "right")
  plot_list[[length(plot_list) + 1]] <- temp_plot
}
grid.arrange(grobs = plot_list, ncol = 2)
# Same per-predictor scatter plots, this time coloured by medical-school
# affiliation.
plot_list <- list()
indexes <- c(2:6, 9:11) # column positions of the numeric predictors in base_data
for (i in seq_along(indexes)) {
  col_idx <- indexes[i]
  x_var <- names(base_data)[col_idx]
  # Fix: label lookup by column position; base_data_labels[i] mislabelled
  # every panel (see the matching Region loop).
  x_label <- base_data_labels[col_idx]
  # .data[[x_var]] replaces the deprecated aes_string() for a string-named column.
  temp_plot <- ggplot(data = base_data, aes(x = .data[[x_var]], y = Lgth.of.Sty, color = Med.Sc.Aff)) +
    geom_point() +
    geom_smooth() +
    ggtitle(paste0(x_label, " vs Length of Stay"), "Grouped by Medical School Affiliation") +
    xlab(x_label) +
    ylab("Length of Stay (Days)") +
    theme(legend.position = "right")
  plot_list[[length(plot_list) + 1]] <- temp_plot
}
grid.arrange(grobs = plot_list, ncol = 2)
# Baseline model: regress length of stay on every remaining variable.
# (Assignment switched from `=` to the conventional `<-`.)
all_vars_model <- lm(Lgth.of.Sty ~ ., data = base_data)
# Standard 2x2 lm diagnostic panel.
par(mfrow = c(2,2))
plot(all_vars_model)
# Restore the single-panel layout so later base plots are unaffected —
# the other diagnostic chunks in this document do the same.
par(mfrow = c(1,1))
summary(all_vars_model)
##
## Call:
## lm(formula = Lgth.of.Sty ~ ., data = base_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3048 -0.6608 -0.0272 0.5862 6.3001
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.322292 1.782122 1.864 0.065222 .
## Age 0.079922 0.028266 2.827 0.005668 **
## Inf.Risk 0.439665 0.127298 3.454 0.000812 ***
## R.Cul.Rat 0.005546 0.015982 0.347 0.729299
## R.CX.ray.Rat 0.012688 0.007147 1.775 0.078892 .
## N.Beds -0.004851 0.003603 -1.346 0.181224
## Med.Sc.AffNo -0.266644 0.441089 -0.605 0.546872
## RegionNC -0.812966 0.351406 -2.313 0.022744 *
## RegionS -1.158277 0.351704 -3.293 0.001370 **
## RegionW -1.880560 0.444136 -4.234 5.1e-05 ***
## Avg.Pat 0.015182 0.004424 3.432 0.000872 ***
## Avg.Nur -0.005891 0.002218 -2.656 0.009203 **
## Pct.Ser.Fac -0.012179 0.013774 -0.884 0.378698
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.231 on 100 degrees of freedom
## Multiple R-squared: 0.6299, Adjusted R-squared: 0.5855
## F-statistic: 14.18 on 12 and 100 DF, p-value: < 2.2e-16
The diagnostic plots suggest the linear model fits reasonably well.
The Residuals vs. Fitted and Scale-Location plots show residuals scattered around zero, with no strong signs of nonlinearity or heteroskedasticity. However, Observation 47 stands out, showing a large residual across multiple plots. In the Q–Q plot, most points follow the regression line, but Observation 47 appears on the right tail, deviating from normality.
Leverage diagnostics indicate that most data points are well-standardized around the fitted line. Observation 47 is near the threshold for Cook’s distance, suggesting potential influence, while Observation 112 exhibits high leverage without strong influence. These points may warrant further investigation to understand their impact on the model.
No major assumption violations are evident, but examining these observations could help assess model robustness.
# Generalized variance inflation factors (car::vif). For predictors with
# Df > 1 (Region), GVIF^(1/(2*Df)) is the scale comparable across rows.
vif(all_vars_model)
## GVIF Df GVIF^(1/(2*Df))
## Age 1.176172 1 1.084515
## Inf.Risk 2.154694 1 1.467888
## R.Cul.Rat 1.978520 1 1.406599
## R.CX.ray.Rat 1.416265 1 1.190070
## N.Beds 35.699204 1 5.974881
## Med.Sc.Aff 1.855334 1 1.362107
## Region 1.715222 3 1.094091
## Avg.Pat 34.211423 1 5.849053
## Avg.Nur 7.055523 1 2.656224
## Pct.Ser.Fac 3.241812 1 1.800503
Number of Beds and Average Patients have extremely high VIFs, suggesting high multicollinearity. This makes sense, as the more room a hospital has (number of beds) then the more patients they can have on average. Average number of full time nurses has potential collinearity as well, which also makes sense as a hospital with more beds will likely have more nursing staff to take care of the patients.
# Reduced model: drops N.Beds and Avg.Pat (the extreme-VIF pair flagged
# above) to address multicollinearity; all other predictors are retained.
test_model <- Lgth.of.Sty ~ Age + Inf.Risk + R.Cul.Rat + R.CX.ray.Rat + Avg.Nur + Pct.Ser.Fac + Med.Sc.Aff + Region
test_model_fit <- lm(test_model, data = base_data)
# Diagnostic panel, then restore the default layout.
par(mfrow = c(2,2))
plot(test_model_fit)
par(mfrow = c(1,1))
In comparing the first model (all variables model) and this model, the residuals vs. fitted plot now shows points more evenly scattered around the reference line, indicating fewer systematic patterns in the residuals and a flatter loess curve.
The Q–Q plot remains largely linear, though observations such as #47 and #112 still appear in the right tail, suggesting they continue to exert some influence.
The scale–location plot demonstrates that the spread of residuals has become more consistent across fitted values, with a flatter red line that signals more homoscedasticity than before.
The residuals vs. leverage plot, #47 no longer rises as close to the typical Cook’s distance threshold, while #112 has shifted from being primarily a high‐leverage point to one with a moderately large residual—though neither appears as influential as in the initial model.
# Coefficients and fit statistics for the reduced model.
summary(test_model_fit)
##
## Call:
## lm(formula = test_model, data = base_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7517 -0.8456 -0.0709 0.5848 6.9770
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.025158 1.951126 2.063 0.04165 *
## Age 0.078458 0.031248 2.511 0.01362 *
## Inf.Risk 0.576846 0.136458 4.227 5.16e-05 ***
## R.Cul.Rat -0.013363 0.017119 -0.781 0.43682
## R.CX.ray.Rat 0.011462 0.007911 1.449 0.15045
## Avg.Nur 0.001031 0.001621 0.636 0.52605
## Pct.Ser.Fac -0.003332 0.014373 -0.232 0.81716
## Med.Sc.AffNo -0.970771 0.460747 -2.107 0.03757 *
## RegionNC -0.962513 0.374275 -2.572 0.01156 *
## RegionS -1.135900 0.376770 -3.015 0.00324 **
## RegionW -2.511570 0.459006 -5.472 3.20e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.363 on 102 degrees of freedom
## Multiple R-squared: 0.5368, Adjusted R-squared: 0.4914
## F-statistic: 11.82 on 10 and 102 DF, p-value: 2.727e-13
# VIFs for the reduced model: every value is now well below the usual
# cut-offs, confirming the collinear pair drove the earlier inflation.
vif(test_model_fit)
## GVIF Df GVIF^(1/(2*Df))
## Age 1.171453 1 1.082337
## Inf.Risk 2.017887 1 1.420523
## R.Cul.Rat 1.850070 1 1.360173
## R.CX.ray.Rat 1.414378 1 1.189276
## Avg.Nur 3.071224 1 1.752491
## Pct.Ser.Fac 2.877054 1 1.696188
## Med.Sc.Aff 1.649862 1 1.284470
## Region 1.381390 3 1.055325
# Compare observations 47 and 112 (flagged by the lm diagnostics) against
# the overall mean/median of every numeric variable.
numeric_data <- base_data |> select(where(is.numeric))
# Per-variable mean and median, reshaped to one row per variable
# (columns: variable, mean, median).
stats <- numeric_data |>
dplyr::summarise(
dplyr::across(
.cols = everything(),
.fns = list(
mean = ~ mean(.x, na.rm = TRUE),
median = ~ median(.x, na.rm = TRUE)
)
)
) |>
pivot_longer(
cols = everything(),
names_to = c("variable", ".value"),
names_pattern = "(.*)_(mean|median)"
)
# Long table of the two flagged rows, joined with the summary statistics
# to compute each value's distance from the mean and median.
# NOTE(review): rows are selected by position (47, 112) — assumes
# numeric_data is still in the original row order; confirm.
outliers_diff <- numeric_data[c(47, 112), ] |>
as_tibble(rownames = "obs_id") %>%
pivot_longer(
cols = -obs_id,
names_to = "variable",
values_to = "value"
) |>
left_join(stats, by = "variable") |>
dplyr::mutate(
diff_from_mean = value - mean,
diff_from_median = value - median
)
# Wide presentation table: one row per variable, one column group per
# flagged observation, rounded to 3 decimals.
final_table <- outliers_diff |>
pivot_wider(
id_cols = c(variable, mean, median),
names_from = obs_id,
values_from = c(value, diff_from_mean, diff_from_median)
) |>
dplyr::mutate(dplyr::across(where(is.numeric), ~ round(.x, 3))) |>
dplyr::rename(
"#47 Value" = value_47,
"#47 Mean Distance" = diff_from_mean_47,
"#47 Median Distance" = diff_from_median_47,
"#112 Value" = value_112,
"#112 Mean Distance" = diff_from_mean_112,
"#112 Median Distance" = diff_from_median_112,
) |>
dplyr::select(
variable, mean, median,
`#112 Value`, `#47 Value`, `#112 Mean Distance`, `#47 Mean Distance`, `#112 Median Distance`, `#47 Median Distance`,
)
# Render as a static DT table (dom = "t" hides search/pagination controls).
datatable(final_table, options = list(dom = "t", width = "100%", scrollX = TRUE))
# Refit the reduced model without observations 47 and 112 (negative row
# indices drop those rows by position).
test_model_remove <- Lgth.of.Sty ~ Age + Inf.Risk + R.Cul.Rat + R.CX.ray.Rat + Avg.Nur + Pct.Ser.Fac + Med.Sc.Aff + Region
test_model_remove_fit <- lm(test_model_remove, data = base_data[c(-112, -47), ])
# Diagnostic panel, then restore the default layout.
par(mfrow = c(2,2))
plot(test_model_remove_fit)
par(mfrow = c(1,1))
After removing observations #47 and #112, the model’s residual diagnostics show notable improvements.
Residuals vs. Fitted plot indicates a reduction in heteroscedasticity, as the variance of residuals is now more evenly spread with less curvature in the trend.
The Q-Q plot aligns more closely with the normal distribution, suggesting an improvement in the normality assumption, as the extreme deviations from #47 and #112 are no longer distorting the distribution.
In the Scale-Location plot, variance appears more stable, reinforcing the improved homoscedasticity.
Finally, the Residuals vs. Leverage plot confirms that these observations had high leverage and strong influence on model coefficients, as their removal results in a more balanced distribution of leverage across data points.
summary(test_model_remove_fit)
##
## Call:
## lm(formula = test_model_remove, data = base_data[c(-112, -47),
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.19321 -0.65320 -0.06784 0.51335 3.01070
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.228576 1.466415 3.566 0.000559 ***
## Age 0.054064 0.023485 2.302 0.023404 *
## Inf.Risk 0.469248 0.102685 4.570 1.40e-05 ***
## R.Cul.Rat -0.004346 0.012845 -0.338 0.735797
## R.CX.ray.Rat 0.008063 0.005934 1.359 0.177264
## Avg.Nur 0.001118 0.001213 0.922 0.358648
## Pct.Ser.Fac -0.002924 0.010752 -0.272 0.786261
## Med.Sc.AffNo -0.689680 0.349635 -1.973 0.051306 .
## RegionNC -0.544629 0.283414 -1.922 0.057494 .
## RegionS -0.758340 0.284514 -2.665 0.008968 **
## RegionW -2.069147 0.346224 -5.976 3.53e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.018 on 100 degrees of freedom
## Multiple R-squared: 0.567, Adjusted R-squared: 0.5237
## F-statistic: 13.09 on 10 and 100 DF, p-value: 2.325e-14
vif(test_model_remove_fit)
## GVIF Df GVIF^(1/(2*Df))
## Age 1.158175 1 1.076185
## Inf.Risk 1.977350 1 1.406183
## R.Cul.Rat 1.850144 1 1.360200
## R.CX.ray.Rat 1.388417 1 1.178311
## Avg.Nur 3.005297 1 1.733579
## Pct.Ser.Fac 2.836364 1 1.684151
## Med.Sc.Aff 1.615906 1 1.271183
## Region 1.352387 3 1.051599
# Root-mean-squared error between observed and predicted values.
# Both arguments are numeric vectors of equal length.
rmse <- function(actual, predicted) {
  errors <- actual - predicted
  sqrt(mean(errors^2))
}
# Compare in-sample RMSE before and after removing the two influential
# observations. Fix: the code defined a local rmse() helper above but then
# called caret::RMSE() and left the helper unused — use the local helper
# (both compute the identical quantity, so the printed values are unchanged).
predictions_before <- predict(test_model_fit, base_data)
rmse_before <- rmse(base_data$Lgth.of.Sty, predictions_before)
base_data_filtered <- base_data[c(-112, -47), ]
predictions_after <- predict(test_model_remove_fit, base_data_filtered)
rmse_after <- rmse(base_data_filtered$Lgth.of.Sty, predictions_after)
sprintf("RMSE without removing = %.3f -- RMSE without 112 and 47 = %.3f", rmse_before, rmse_after)
## [1] "RMSE without removing = 1.295 -- RMSE without 112 and 47 = 0.966"
The reduction in RMSE from 1.295 to 0.966 after removing observations 47 and 112 suggests that these points were contributing to higher model error. Their removal improves the model’s predictive accuracy, indicating that it generalizes better to new data and produces more reliable estimates
Additionally by removing highly correlated variables such as Number of Beds and Average Patients, we’ve substantially lowered variance inflation. This indicates that our changes have mitigated multicollinearity and improved the stability and interpretability of the model.
# Refit the full (all-predictor) model on the outlier-free data for an
# ANOVA-based comparison.
interaction_data <- base_data[c(-112, -47), ]
anova_model <- lm(Lgth.of.Sty ~ ., data = interaction_data)
summary(anova_model)
##
## Call:
## lm(formula = Lgth.of.Sty ~ ., data = interaction_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.07731 -0.57223 -0.08947 0.61897 3.06561
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.9450368 1.4707217 3.362 0.001103 **
## Age 0.0578446 0.0233165 2.481 0.014811 *
## Inf.Risk 0.4226656 0.1053017 4.014 0.000117 ***
## R.Cul.Rat 0.0033790 0.0133374 0.253 0.800530
## R.CX.ray.Rat 0.0086477 0.0058544 1.477 0.142845
## N.Beds -0.0008609 0.0031127 -0.277 0.782699
## Med.Sc.AffNo -0.4888113 0.3610531 -1.354 0.178899
## RegionNC -0.5864174 0.2883010 -2.034 0.044649 *
## RegionS -0.8703365 0.2906440 -2.995 0.003480 **
## RegionW -1.8920505 0.3657131 -5.174 1.22e-06 ***
## Avg.Pat 0.0057002 0.0043733 1.303 0.195489
## Avg.Nur -0.0023495 0.0019917 -1.180 0.241002
## Pct.Ser.Fac -0.0095188 0.0113323 -0.840 0.402968
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.003 on 98 degrees of freedom
## Multiple R-squared: 0.5878, Adjusted R-squared: 0.5373
## F-statistic: 11.64 on 12 and 98 DF, p-value: 3.301e-14
# Sequential (Type I) ANOVA table for the refitted full model.
anova_results <- anova(anova_model)
vif(anova_model)
## GVIF Df GVIF^(1/(2*Df))
## Age 1.175247 1 1.084088
## Inf.Risk 2.140653 1 1.463097
## R.Cul.Rat 2.053431 1 1.432980
## R.CX.ray.Rat 1.391222 1 1.179501
## N.Beds 36.769932 1 6.063822
## Med.Sc.Aff 1.773930 1 1.331890
## Region 1.771611 3 1.100005
## Avg.Pat 43.286785 1 6.579269
## Avg.Nur 8.346041 1 2.888952
## Pct.Ser.Fac 3.243355 1 1.800932
# Residual mean square from the ANOVA table; its square root is the
# in-sample RMSE. Fix: use a distinct name (anova_rmse) so the rmse()
# helper function defined earlier is not shadowed by a numeric value.
mse <- anova_results["Residuals", "Mean Sq"]
anova_rmse <- sqrt(mse)
sprintf("Anova RMSE %.3f", anova_rmse)
## [1] "Anova RMSE 1.003"
Interpretations:
Relative to the Northeast (the reference region), patients in North Central hospitals stay about 0.59 days shorter on average, a statistically significant difference. Patients in Southern hospitals stay about 0.87 days shorter on average, also statistically significant. In the Western region, the average length of stay is about 1.89 days shorter than in the Northeast, a highly significant result.
lasso_data <- interaction_data
# Prepare data: separate predictors (X) and target variable (y).
# Fix: as.matrix() on a data frame that contains factor columns (Region,
# Med.Sc.Aff) coerces the *entire* matrix to character, which glmnet
# rejects. model.matrix() expands the factors into numeric dummy columns
# instead; [, -1] drops the intercept column (glmnet fits its own).
X <- model.matrix(Lgth.of.Sty ~ ., data = lasso_data)[, -1]
y <- interaction_data$Lgth.of.Sty # Target variable
# NOTE(review): the printed results below predate this fix and may shift
# slightly once the factor dummies enter the penalized model; re-knit to
# refresh them.
# Perform LASSO regression with cross-validation
cv_lasso <- cv.glmnet(X, y, alpha = 1, nfolds = 10) # alpha = 1 for LASSO
# Plot cross-validation results
plot(cv_lasso)
# Find the optimal lambda (penalty parameter)
best_lambda <- cv_lasso$lambda.min
print(paste("Optimal lambda:", best_lambda))
## [1] "Optimal lambda: 0.00483579122437186"
# Fit final LASSO model using optimal lambda
lasso_model <- glmnet(X, y, alpha = 1, lambda = best_lambda)
# Extract feature coefficients
lasso_coefficients <- coef(lasso_model)
# Convert coefficients to a named vector
lasso_coefficients <- as.vector(lasso_coefficients)
names(lasso_coefficients) <- rownames(coef(lasso_model))
# Print Intercept
intercept <- lasso_coefficients["(Intercept)"]
print(paste("Intercept:", intercept))
## [1] "Intercept: 2.50577455201892"
# Print selected features and their coefficients
selected_features <- lasso_coefficients[lasso_coefficients != 0] # Non-zero coefficients
print("Selected Features and Coefficients:")
## [1] "Selected Features and Coefficients:"
print(selected_features)
## (Intercept) Age Inf.Risk R.Cul.Rat R.CX.ray.Rat N.Beds Avg.Pat Avg.Nur
## 2.505774552 0.068425551 0.298458122 0.025309004 0.013993631 -0.004244115 0.011463667 -0.003267501
# In-sample (training) RMSE — an optimistic estimate; the CV curve above
# is the honest error estimate.
predictions <- predict(lasso_model, X) # Predict on training data
rmse <- sqrt(mean((y - predictions)^2)) # RMSE calculation
print(paste("RMSE:", rmse))
## [1] "RMSE: 1.07226971046126"
# Compute R-squared
ss_total <- sum((y - mean(y))^2) # Total sum of squares
ss_residual <- sum((y - predictions)^2) # Residual sum of squares
r_squared <- 1 - (ss_residual / ss_total) # R-squared
# Compute Adjusted R-squared
n <- length(y) # Number of observations
p <- length(selected_features) - 1 # Number of selected predictors (excluding intercept)
adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))
print(paste("Adjusted R-squared:", adj_r_squared))
## [1] "Adjusted R-squared: 0.430271762415266"
# base_data %>% count(base_data$RegionAsFactor) %>% mutate(percentage = n / sum(n) * 100)
After analyzing the outliers, we proceed without both of them (observations 47 and 112).
origCleaned <- base_data # untouched copy kept for reference
cleaned_obj_2 <- base_data[c(-112, -47), ] # drop the two influential observations
set.seed(1234) # reproducible CV fold assignment
train_control <- trainControl(method = "cv", number = 5)
# KNN regression over all predictors, tuning k by 5-fold CV RMSE.
# NOTE(review): no centring/scaling is applied; distance-based KNN is
# sensitive to variable scale — consider preProcess = c("center", "scale").
knn_model <- train(
Lgth.of.Sty ~ .,
data = cleaned_obj_2,
method = "knn",
trControl = train_control,
tuneGrid = expand.grid(k = c(1:10, 20, 30))
)
print(knn_model)
## k-Nearest Neighbors
##
## 111 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 88, 90, 88, 89, 89
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 1.636425 0.1570248 1.341632
## 2 1.422562 0.1693380 1.120702
## 3 1.433125 0.1277156 1.120258
## 4 1.361358 0.1735495 1.052855
## 5 1.340433 0.1906822 1.066184
## 6 1.338491 0.1898658 1.066725
## 7 1.308330 0.2283080 1.063999
## 8 1.309400 0.2201435 1.045493
## 9 1.306700 0.2295606 1.036747
## 10 1.325767 0.2110012 1.047278
## 20 1.314614 0.2099153 1.041218
## 30 1.291736 0.2333301 1.027280
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 30.
# RMSE vs k tuning curve.
plot(knn_model)
# Extract the tuning table and the k caret selected (lowest CV RMSE);
# square the RMSE to report it as MSE.
knn_results <- knn_model$results
best_k <- knn_model$bestTune$k
best_mse <- min(knn_results$RMSE^2)
sprintf("Best k: %d, Best MSE: %.3f", best_k, best_mse)
## [1] "Best k: 30, Best MSE: 1.669"
When looking at all numeric predictors, the best k for RMSE is 30, however, we see that at k = 9, we also have quite a low RMSE as well.
set.seed(1234) # same seed so folds match the previous KNN fit
train_control <- trainControl(method = "cv", number = 5)
# KNN on a reduced predictor subset, same tuning grid as before.
knn_model <- train(
Lgth.of.Sty ~ Age + Inf.Risk + R.Cul.Rat + R.CX.ray.Rat + N.Beds + Med.Sc.Aff,
data = cleaned_obj_2,
method = "knn",
trControl = train_control,
tuneGrid = expand.grid(k = c(1:10, 20, 30))
)
print(knn_model)
## k-Nearest Neighbors
##
## 111 samples
## 6 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 88, 90, 88, 89, 89
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 1.607190 0.1527092 1.273367
## 2 1.415132 0.1777442 1.142162
## 3 1.355112 0.2226904 1.101688
## 4 1.343746 0.2298662 1.096834
## 5 1.314924 0.2434483 1.081474
## 6 1.303935 0.2417774 1.078696
## 7 1.284000 0.2547728 1.061646
## 8 1.274059 0.2704031 1.045664
## 9 1.279255 0.2676272 1.035398
## 10 1.267711 0.2791885 1.018944
## 20 1.307496 0.2181723 1.050198
## 30 1.288398 0.2367415 1.022785
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 10.
# RMSE vs k tuning curve for the reduced-predictor fit.
plot(knn_model)
# Selected k and its CV error reported as MSE (RMSE squared).
knn_results <- knn_model$results
best_k <- knn_model$bestTune$k
best_mse <- min(knn_results$RMSE^2)
sprintf("Best k: %d, Best MSE: %.3f", best_k, best_mse)
## [1] "Best k: 10, Best MSE: 1.607"
For this reduced predictor set, the lowest cross-validated RMSE occurs at k = 10, with k = 7 through 9 performing nearly as well.
set.seed(1234) # same seed so folds match the previous KNN fits
train_control <- trainControl(method = "cv", number = 5)
# KNN using only the four predictors that were consistently significant in
# the linear models.
knn_model <- train(
Lgth.of.Sty ~ Age + Inf.Risk + Region + Med.Sc.Aff,
data = cleaned_obj_2,
method = "knn",
trControl = train_control,
tuneGrid = expand.grid(k = c(1:10, 20, 30))
)
print(knn_model)
## k-Nearest Neighbors
##
## 111 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 88, 90, 88, 89, 89
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 1.794866 0.1190193 1.396178
## 2 1.435927 0.2136443 1.130902
## 3 1.366410 0.2279952 1.085376
## 4 1.360840 0.2363762 1.091787
## 5 1.325933 0.2258781 1.056191
## 6 1.266148 0.2922350 1.015717
## 7 1.296658 0.2620468 1.048769
## 8 1.336213 0.1995070 1.087715
## 9 1.327809 0.2070321 1.080879
## 10 1.344680 0.1797312 1.084815
## 20 1.338769 0.2516622 1.077299
## 30 1.398054 0.1439504 1.126062
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 6.
# RMSE vs k tuning curve for the four-predictor fit.
plot(knn_model)
# Selected k and its CV error reported as MSE (RMSE squared).
knn_results <- knn_model$results
best_k <- knn_model$bestTune$k
best_mse <- min(knn_results$RMSE^2)
sprintf("Best k: %d, Best MSE: %.3f", best_k, best_mse)
## [1] "Best k: 6, Best MSE: 1.603"
Interestingly, for the predictors of Age, Inf.Risk, Med.Sc.Aff, and Region, k = 6 has the best RMSE.